Testing for just one site:

# log-log, with cohort fixed effects: for just one site

lm_mod_log2_cohort <- lm(log(proportion) ~ 0 + log(time_1):cohort, data = filter(dat_l, site == "shaanxi"))
par(mfrow = c(2,2))
plot(lm_mod_log2_cohort)
## Warning: not plotting observations with leverage one:
##   350

summary(lm_mod_log2_cohort)
## 
## Call:
## lm(formula = log(proportion) ~ 0 + log(time_1):cohort, data = filter(dat_l, 
##     site == "shaanxi"))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.049496 -0.013912 -0.000122  0.017841  0.106898 
## 
## Coefficients: (1 not defined because of singularities)
##                         Estimate Std. Error  t value Pr(>|t|)    
## log(time_1):cohort1988 -0.218069   0.002324  -93.844  < 2e-16 ***
## log(time_1):cohort1989 -0.223069   0.002404  -92.795  < 2e-16 ***
## log(time_1):cohort1990 -0.217128   0.002491  -87.176  < 2e-16 ***
## log(time_1):cohort1991 -0.254079   0.002585  -98.288  < 2e-16 ***
## log(time_1):cohort1992 -0.221388   0.002688  -82.360  < 2e-16 ***
## log(time_1):cohort1993 -0.290537   0.002801 -103.730  < 2e-16 ***
## log(time_1):cohort1994 -0.248796   0.002925  -85.053  < 2e-16 ***
## log(time_1):cohort1995 -0.195754   0.003063  -63.913  < 2e-16 ***
## log(time_1):cohort1996 -0.151948   0.003216  -47.247  < 2e-16 ***
## log(time_1):cohort1997 -0.144521   0.003388  -42.659  < 2e-16 ***
## log(time_1):cohort1998 -0.184442   0.003582  -51.493  < 2e-16 ***
## log(time_1):cohort1999 -0.141598   0.003803  -37.235  < 2e-16 ***
## log(time_1):cohort2000 -0.127973   0.004057  -31.545  < 2e-16 ***
## log(time_1):cohort2001 -0.088992   0.004352  -20.448  < 2e-16 ***
## log(time_1):cohort2002 -0.063893   0.004700  -13.594  < 2e-16 ***
## log(time_1):cohort2003 -0.068657   0.005116  -13.420  < 2e-16 ***
## log(time_1):cohort2004 -0.095691   0.005623  -17.018  < 2e-16 ***
## log(time_1):cohort2005 -0.091899   0.006254  -14.693  < 2e-16 ***
## log(time_1):cohort2006 -0.081600   0.007064  -11.552  < 2e-16 ***
## log(time_1):cohort2007 -0.080951   0.008139   -9.946  < 2e-16 ***
## log(time_1):cohort2008 -0.067471   0.009639   -7.000 1.46e-11 ***
## log(time_1):cohort2009 -0.052793   0.011875   -4.446 1.20e-05 ***
## log(time_1):cohort2010 -0.037733   0.015563   -2.424   0.0159 *  
## log(time_1):cohort2011 -0.036645   0.022761   -1.610   0.1084    
## log(time_1):cohort2012 -0.035781   0.042656   -0.839   0.4022    
## log(time_1):cohort2013        NA         NA       NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02957 on 326 degrees of freedom
## Multiple R-squared:  0.9956, Adjusted R-squared:  0.9953 
## F-statistic:  2969 on 25 and 326 DF,  p-value: < 2.2e-16
broom::tidy(lm_mod_log2_cohort)
broom::glance(lm_mod_log2_cohort)
formula(lm_mod_log2_cohort)
## log(proportion) ~ 0 + log(time_1):cohort
broom::augment(lm_mod_log2_cohort)
dev.off()
## null device 
##           1
# run lms for each site individually, store in lists.

# log-log model, with cohort fixed effects
lm_log2_l <- lapply(1:11, FUN = function(i) {
  lm(log(proportion) ~ 0 + log(time_1):cohort,  
               data = filter(dat_l, site == site_names[i]))
})

# log-log model, without cohort fixed effects
lm_log2_no_cohort_l <- lapply(1:11, FUN = function(i) {
  lm(log(proportion) ~ 0 + log(time_1),  
               data = filter(dat_l, site == site_names[i]))
})

# log-log model, with year_abn (continuous) rather than the categorical cohort
lm_log2_year_abn_l <- lapply(1:11, FUN = function(i) {
  lm(log(proportion) ~ 0  + log(time_1) + log(time_1):year_abn,  
               data = filter(dat_l, site == site_names[i]))
})

# lon-linear (exp. decay)
lm_log_lin_l <- lapply(1:11, FUN = function(i) {
  lm(log(proportion) ~ 0 + time:cohort,  
               data = filter(dat_l, site == site_names[i]))
})

# linear
lm_0_l <- lapply(1:11, FUN = function(i) {
  lm(log(proportion) ~ 0 + time,  
               data = filter(dat_l, site == site_names[i]))
})

names(lm_log2_l) <- site_names
names(lm_log2_no_cohort_l) <- site_names
names(lm_log2_year_abn_l) <- site_names
names(lm_log_lin_l) <- site_names
names(lm_0_l) <- site_names
# ----------------------- log-log cohort plot ----------------------- #

# plot with age on the x-axis
ggplot(mapping = aes(x = age, y = proportion, group = year_abn, color = year_abn)) + 
  geom_point(data = filter(dat_l, site == "shaanxi")) + 
  scale_color_distiller(palette = "Greens") + 
  scale_x_continuous(n.breaks = 30) + 
  labs(title = "Decay of abandoned land over time", #subtitle = site_df$description[indx], 
       caption = "lm formula: log(proportion) ~ 0 + log(time_1):cohort",
       x = "Time since initial abandonment (years)", y = "Proportion of abandoned land remaining",
       color = "Cohort (Year \nFirst Abandoned)") + 
  
  # add the lm curves, with predictions (this works because the "newdata" df has "age" in it, as well as "time_1")
  geom_line(data = mutate(augment(lm_mod_log2_cohort, newdata = filter(dat_l_0, site == "shaanxi")),
                          proportion = exp(.fitted))) + 
  
  # add the mean coefficient (needs to be transformed 4 to the right, so that plotting works)
  geom_function(fun = function(x) { (x-4) ^ (mean(coef(lm_log2_l$shaanxi), na.rm = TRUE))},
                mapping = aes(linetype = "Mean Decay Rate"),
                color = "blue", size = 1, #linetype = "dashed", 
                inherit.aes = FALSE) +
  scale_linetype(name = "")
## Warning in predict.lm(x, newdata = newdata, na.action = na.pass, ...):
## prediction from a rank-deficient fit may be misleading

# ----------------------------- #
# plot with time_1 on the x-axis

ggplot(data = filter(dat_l, site == "shaanxi"), 
       mapping = aes(x = time_1, y = proportion, group = year_abn, color = year_abn)) + 
  geom_point() + scale_color_distiller(palette = "Greens") + 
  geom_line(data = mutate(augment(lm_mod_log2_cohort), # or augment(lm_log2_l$shaanxi)
                          time_1 = exp(`log(time_1)`),
                          proportion = exp(.fitted),
                          year_abn = as.numeric(levels(cohort))[cohort])) + 
  scale_x_continuous(n.breaks = 30) + 
  
  # the mean coefficient
  geom_function(fun = function(x) { x ^ mean(coef(lm_mod_log2_cohort), na.rm = TRUE)},
                size = 1, linetype = "dashed", inherit.aes = FALSE)

your_path <- "/Users/christophercrawford/Desktop/test/"

# ----------------------- diagnostics plot, for all 11 sites in sequence ----------------------- #
lapply(1:11, FUN = function(i) {
  
  png(filename = paste0(your_path, "log2_cohort_diag_", names(lm_log2_l[i]),".png"), 
    width = 8, height = 8, units = "in", res = 400)
  par(mfrow = c(2,2))
  plot(lm_log2_l[[i]], 1, main = site_names[i])
  plot(lm_log2_l[[i]], 2)
  plot(lm_log2_l[[i]], 3)
  plot(lm_log2_l[[i]], 4)

  dev.off()
  
})
## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350

## Warning: not plotting observations with leverage one:
##   350
## [[1]]
## quartz_off_screen 
##                 2 
## 
## [[2]]
## quartz_off_screen 
##                 2 
## 
## [[3]]
## quartz_off_screen 
##                 2 
## 
## [[4]]
## quartz_off_screen 
##                 2 
## 
## [[5]]
## quartz_off_screen 
##                 2 
## 
## [[6]]
## quartz_off_screen 
##                 2 
## 
## [[7]]
## quartz_off_screen 
##                 2 
## 
## [[8]]
## quartz_off_screen 
##                 2 
## 
## [[9]]
## quartz_off_screen 
##                 2 
## 
## [[10]]
## quartz_off_screen 
##                 2 
## 
## [[11]]
## quartz_off_screen 
##                 2
# ----------------------- fitted vs. residuals diag plot, all 11 sites in one figure ----------------------- #
png(filename = paste0(your_path, "log2_cohort_diag_resid_v_fitted.png"), 
    width = 9, height = 8, units = "in", res = 400)
par(mfrow = c(3, 4))
for (i in 1:11) plot(lm_log2_l[[i]], 1, main = site_names[i])
dev.off()
## quartz_off_screen 
##                 2
# construct a model of *all* sites, with cohort and site fixed effects
str(dat_l)
## tibble [3,861 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ site            : Factor w/ 11 levels "belarus","bosnia_herzegovina",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year            : num [1:3861] 1992 1993 1993 1994 1994 ...
##  $ year_abn        : num [1:3861] 1988 1988 1989 1988 1989 ...
##  $ count           : num [1:3861] 2055644 1659519 1260207 1455200 1067262 ...
##  $ area_ha         : num [1:3861] 106770 86195 65455 75583 55433 ...
##  $ age             : num [1:3861] 5 6 5 7 6 5 8 7 6 5 ...
##  $ bins            : Factor w/ 5 levels "10 to 15 years",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ proportion      : num [1:3861] 1 0.807 1 0.708 0.847 ...
##  $ initial_area_abn: num [1:3861] 106770 106770 65455 106770 65455 ...
##  $ year_abn_bins   : Factor w/ 5 levels "(1993,1998]",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ time            : num [1:3861] 0 1 0 2 1 0 3 2 1 0 ...
##  $ time_1          : num [1:3861] 1 2 1 3 2 1 4 3 2 1 ...
##  $ cohort          : Factor w/ 26 levels "1988","1989",..: 1 1 2 1 2 3 1 2 3 4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   site = col_character(),
##   ..   year = col_double(),
##   ..   year_abn = col_double(),
##   ..   count = col_double(),
##   ..   area_ha = col_double(),
##   ..   age = col_double(),
##   ..   bins = col_character(),
##   ..   proportion = col_double(),
##   ..   initial_area_abn = col_double(),
##   ..   year_abn_bins = col_character(),
##   ..   time = col_double(),
##   ..   time_1 = col_double(),
##   ..   cohort = col_double()
##   .. )
# log-log model, with cohort and site fixed effects
lm_log2_cohort_all_site <- lm(log(proportion) ~ 0 + log(time_1):cohort:site, data = dat_l)


# ------------------------------------------------ #
# diagnostic plots
# ------------------------------------------------ #
par(mfrow = c(2,2))
plot(lm_log2_cohort_all_site, main = "Model with all sites, all cohorts")
## Warning: not plotting observations with leverage one:
##   350, 701, 1052, 1403, 1754, 2105, 2456, 2807, 3158, 3509, 3860

# now, without the outlier
par(mfrow = c(2,2))
plot(lm(log(proportion) ~ 0 + log(time_1):cohort:site, data = dat_l[-2092, ]), main = "Model with all sites, all cohorts")
## Warning: not plotting observations with leverage one:
##   350, 701, 1052, 1403, 1754, 2104, 2455, 2806, 3157, 3508, 3859

# now with a log(time) term and a linear time term: 
lm_mod_log2_cohort_plus <- lm(log(proportion) ~ 0 + log(time_1):cohort + time_1:cohort, data = filter(dat_l, site == "shaanxi"))

ggplot(data = filter(dat_l, site == "shaanxi"), 
       mapping = aes(x = time_1, y = proportion, group = year_abn, color = year_abn)) + 
  geom_point() + scale_color_distiller(palette = "Greens") + 
  geom_line(data = mutate(augment(lm_mod_log2_cohort_plus), # or augment(lm_log2_l$shaanxi)
                          time_1 = exp(`log(time_1)`),
                          proportion = exp(.fitted),
                          year_abn = as.numeric(levels(cohort))[cohort])) + 
  scale_x_continuous(n.breaks = 30)

par(mfrow = c(2, 2))
plot(lm_mod_log2_cohort_plus)
## Warning: not plotting observations with leverage one:
##   325, 350, 351
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

# residuals vs. covariates
lm_mod_log2_cohort
## 
## Call:
## lm(formula = log(proportion) ~ 0 + log(time_1):cohort, data = filter(dat_l, 
##     site == "shaanxi"))
## 
## Coefficients:
## log(time_1):cohort1988  log(time_1):cohort1989  log(time_1):cohort1990  
##               -0.21807                -0.22307                -0.21713  
## log(time_1):cohort1991  log(time_1):cohort1992  log(time_1):cohort1993  
##               -0.25408                -0.22139                -0.29054  
## log(time_1):cohort1994  log(time_1):cohort1995  log(time_1):cohort1996  
##               -0.24880                -0.19575                -0.15195  
## log(time_1):cohort1997  log(time_1):cohort1998  log(time_1):cohort1999  
##               -0.14452                -0.18444                -0.14160  
## log(time_1):cohort2000  log(time_1):cohort2001  log(time_1):cohort2002  
##               -0.12797                -0.08899                -0.06389  
## log(time_1):cohort2003  log(time_1):cohort2004  log(time_1):cohort2005  
##               -0.06866                -0.09569                -0.09190  
## log(time_1):cohort2006  log(time_1):cohort2007  log(time_1):cohort2008  
##               -0.08160                -0.08095                -0.06747  
## log(time_1):cohort2009  log(time_1):cohort2010  log(time_1):cohort2011  
##               -0.05279                -0.03773                -0.03664  
## log(time_1):cohort2012  log(time_1):cohort2013  
##               -0.03578                      NA
plot(lm_mod_log2_cohort)
## Warning: not plotting observations with leverage one:
##   350

augment(lm_mod_log2_cohort) %>% arrange(desc(.hat))
# just one site
resid_df1 <-  bind_cols(augment(lm_mod_log2_cohort), # with the tester model from above
                        filter(dat_l, site == "shaanxi"))
## New names:
## * cohort -> cohort...3
## * cohort -> cohort...21
# all sites individually, combined
resid_df <- lapply(lm_log2_l, augment) %>% bind_rows(.id = "site") %>%
  bind_cols(., dat_l)
## New names:
## * site -> site...1
## * cohort -> cohort...4
## * site -> site...10
## * cohort -> cohort...22
# residuals from the full model including all sites
resid_df_all <- bind_cols(dat_l, 
                          augment(lm_log2_cohort_all_site))
## New names:
## * site -> site...1
## * cohort -> cohort...13
## * cohort -> cohort...16
## * site -> site...17
# plot for just shaanxi
str(resid_df)
## tibble [3,861 × 22] (S3: tbl_df/tbl/data.frame)
##  $ site...1        : chr [1:3861] "belarus" "belarus" "belarus" "belarus" ...
##  $ log(proportion) : num [1:3861] 0 -0.214 0 -0.345 -0.166 ...
##  $ log(time_1)     : num [1:3861] 0 0.693 0 1.099 0.693 ...
##  $ cohort...4      : Factor w/ 26 levels "1988","1989",..: 1 1 2 1 2 3 1 2 3 4 ...
##  $ .fitted         : num [1:3861] 0 -0.257 0 -0.407 -0.249 ...
##  $ .std.resid      : num [1:3861] 4.03e-14 1.39 1.06e-14 2.00 2.68 ...
##  $ .hat            : num [1:3861] 0 0.00297 0 0.00746 0.00318 ...
##  $ .sigma          : num [1:3861] 0.0311 0.031 0.0311 0.0309 0.0308 ...
##  $ .cooksd         : num [1:3861] 0 0.000229 0 0.001207 0.000914 ...
##  $ site...10       : Factor w/ 11 levels "belarus","bosnia_herzegovina",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year            : num [1:3861] 1992 1993 1993 1994 1994 ...
##  $ year_abn        : num [1:3861] 1988 1988 1989 1988 1989 ...
##  $ count           : num [1:3861] 2055644 1659519 1260207 1455200 1067262 ...
##  $ area_ha         : num [1:3861] 106770 86195 65455 75583 55433 ...
##  $ age             : num [1:3861] 5 6 5 7 6 5 8 7 6 5 ...
##  $ bins            : Factor w/ 5 levels "10 to 15 years",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ proportion      : num [1:3861] 1 0.807 1 0.708 0.847 ...
##  $ initial_area_abn: num [1:3861] 106770 106770 65455 106770 65455 ...
##  $ year_abn_bins   : Factor w/ 5 levels "(1993,1998]",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ time            : num [1:3861] 0 1 0 2 1 0 3 2 1 0 ...
##  $ time_1          : num [1:3861] 1 2 1 3 2 1 4 3 2 1 ...
##  $ cohort...22     : Factor w/ 26 levels "1988","1989",..: 1 1 2 1 2 3 1 2 3 4 ...
##  - attr(*, "terms")=Classes 'terms', 'formula'  language log(proportion) ~ 0 + log(time_1):cohort
##   .. ..- attr(*, "variables")= language list(log(proportion), log(time_1), cohort)
##   .. ..- attr(*, "factors")= int [1:3, 1] 0 2 2
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:3] "log(proportion)" "log(time_1)" "cohort"
##   .. .. .. ..$ : chr "log(time_1):cohort"
##   .. ..- attr(*, "term.labels")= chr "log(time_1):cohort"
##   .. ..- attr(*, "order")= int 2
##   .. ..- attr(*, "intercept")= int 0
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: 0x7f7eeab92240> 
##   .. ..- attr(*, "predvars")= language list(log(proportion), log(time_1), cohort)
##   .. ..- attr(*, "dataClasses")= Named chr [1:3] "numeric" "numeric" "factor"
##   .. .. ..- attr(*, "names")= chr [1:3] "log(proportion)" "log(time_1)" "cohort"
gg_resid_time <- ggplot(data = resid_df, 
                        aes(x = time, y = .std.resid, color = year_abn)) + 
  geom_point() + 
  scale_color_distiller(palette = "Greens") + 
  labs(y = "Residuals", x = "Time Past Abandonment Threshold")

gg_resid_area <- ggplot(data = resid_df, 
                        aes(x = initial_area_abn, y = .std.resid, color = year_abn)) + 
  geom_point(position = position_jitter(width = 20)) + 
  scale_color_distiller(palette = "Greens") + 
  labs(y = "Residuals", x = "Initial Area Abandoned") + 
  theme(legend.position = "none")



#install.packages("cowplot")
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
# just shaanxi
cowplot::plot_grid(gg_resid_time, gg_resid_area, rel_widths = c(1.2, 1))
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing missing values (geom_point).

# all sites
cowplot::plot_grid(
  gg_resid_time %+% resid_df[-2092, ], # without outlier
  gg_resid_area %+% resid_df[-2092, ], # without outlier
  rel_widths = c(1.2, 1))
## Warning: Removed 11 rows containing missing values (geom_point).

## Warning: Removed 11 rows containing missing values (geom_point).

# residuals from the model with all sites: 
cowplot::plot_grid(
  gg_resid_time %+% resid_df_all[-2092, ], # without outlier
  gg_resid_area %+% resid_df_all[-2092, ], # without outlier
  rel_widths = c(1.2, 1))
## Warning: Removed 11 rows containing missing values (geom_point).

## Warning: Removed 11 rows containing missing values (geom_point).

# to plot specific sites:
cowplot::plot_grid(
  gg_resid_time %+% filter(resid_df[-2092, ], site...1 == "shaanxi"), # without outlier
  gg_resid_area %+% filter(resid_df[-2092, ], site...1 == "shaanxi"), # without outlier
  rel_widths = c(1.2, 1))
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).